#!/usr/bin/python
# coding: utf-8

from goose import Goose
import newspaper


class Crawler:
    """Fetch a web page and extract its article content.

    Thin wrapper around two third-party extractors: ``Goose`` and
    ``newspaper``. Every method takes the page URL and returns text.
    """

    def get_src_html(self, url):
        """Return the raw HTML that Goose fetched for *url*.

        A new ``Goose`` instance is created on every call.
        """
        g = Goose()
        news = g.extract(url=url)
        return news.raw_html

    def get_content_by_Goose(self, url):
        """Return the cleaned article text that Goose extracted from *url*.

        A new ``Goose`` instance is created on every call. Exceptions
        raised by Goose are not caught here and propagate to the caller.
        """
        g = Goose()
        article = g.extract(url=url)
        return article.cleaned_text

    def get_content_by_newspaper(self, url):
        """Return the article text that newspaper extracted from *url*.

        This is a best-effort call: if building, downloading or parsing
        the article raises any ``Exception``, a message is printed and an
        empty unicode string is returned instead of propagating the error.
        """
        try:
            article = newspaper.build_article(url)
            article.download()
            article.parse()
            return article.text
        except Exception:
            # Deliberately broad: callers rely on getting u"" back on any
            # failure rather than having to handle extractor exceptions.
            print("Download Error or Parse Error")
            return u""

if __name__ == "__main__":
    # Manual smoke test: extract one known article with Goose and show it.
    url = u'https://www.usgs.gov/news/us-vietnamese-science-agencies-partner-protect-biodiversity-vietnam'
    crawler = Crawler()
    content = crawler.get_content_by_Goose(url)
    print(content)